library(DT)
library(dplyr)
library(readxl)
library(plotly)
library(skimr)
library(ggplot2)
library(gridExtra)
library(tidyr)
library(GGally)
library(ggcorrplot)
file_path = "../data/master_df.xlsx"
df = read_excel(file_path)
datatable(head(df, 10),
options = list(scrollX = TRUE,
pageLength = 5,
autoWidth = TRUE,
searching = FALSE),
caption = "Preview of Trade and Congressional Data")
skim_df = skim(df) %>%
rename(variable_type = skim_type,
variable_name = skim_variable,
missing_values = n_missing,
min_value = POSIXct.min,
max_value = POSIXct.max,
median_value = POSIXct.median,
unique_values = POSIXct.n_unique,
)
datatable(skim_df, options = list(scrollX = TRUE,
pageLength = 10,
autoWidth = TRUE,
searching = FALSE),
caption = "Summary Statistics for Data")
numerical_vars = c("trade_price", "price_change_pct", "seniority_years", "excess_return")
create_histogram = function(var) {ggplot(df, aes(x = .data[[var]])) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", color = "black", alpha = 0.7) +
geom_density(color = "red", size = 1) +
labs(title = paste("Distribution of", var), x = var, y = "Density") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))}
histogram_plots = lapply(numerical_vars, create_histogram)
grid.arrange(grobs = histogram_plots, ncol = 2)
create_interactive_barplot = function(var) {
plot_ly(data = df, y = ~reorder(.data[[var]], table(.data[[var]])[.data[[var]]]),
type = "bar", orientation = "h", marker = list(color = "steelblue")) %>%
layout(title = paste("Distribution of", var), xaxis = list(title = "Count"),
yaxis = list(title = var))}
transaction_plot <- create_interactive_barplot("transaction_type")
party_plot <- create_interactive_barplot("party")
state_plot <- create_interactive_barplot("state")
gender_plot <- create_interactive_barplot("gender")
transaction_plot
party_plot
state_plot
gender_plot
define_trade_success = function(row) {
if (row$transaction_type %in% c("Purchase")) {
return(ifelse(row$price_change_pct > 0, 1, 0))}
else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
return(ifelse(row$price_change_pct < 0, 1, 0))}
else {return(NA)}}
df = df %>%
rowwise() %>%
mutate(trade_success = define_trade_success(cur_data())) %>%
ungroup() %>%
filter(!is.na(trade_success))
define_market_success = function(row) {
if (row$transaction_type %in% c("Purchase")) {
return(ifelse(row$excess_return > 0, 1, 0))}
else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
return(ifelse(row$excess_return < 0 & row$price_change_pct < 0, 1, 0))}
else {return(NA)}}
df = df %>%
rowwise() %>%
mutate(market_success = define_market_success(cur_data())) %>%
ungroup() %>%
filter(!is.na(market_success))
success_colors = c("0" = "red", "1" = "green")
success_plot = ggplot(df, aes(x = factor(trade_success),
fill = factor(trade_success))) + geom_bar() +
scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
labs(title = "Distribution of Trade Success Outcomes",
x = "Trade Success", y = "Count", fill = "Outcome") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "top")
market_success_plot = ggplot(df, aes(x = factor(market_success),
fill = factor(market_success))) + geom_bar() +
scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
labs(title = "Distribution of Market Success Outcomes",
x = "Market Success", y = "Count", fill = "Outcome") +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "top" )
grid.arrange(success_plot, market_success_plot, ncol = 2)
z = qnorm(0.975)
compute_confidence_interval = function(successes, total) {
p = successes / total
se = sqrt(p * (1 - p) / total)
lower = max(0, p - z * se)
upper = min(1, p + z * se)
return(list(proportion = p, lower_bound = lower, upper_bound = upper))}
committee_columns = grep("committee_", colnames(df), value = TRUE)
committee_success_rates = data.frame(committee = character(),
success_rate = numeric(),
lower_ci = numeric(),
upper_ci = numeric(),
total_trades = integer(),
stringsAsFactors = FALSE)
for (committee in committee_columns) {committee_name = gsub("committee_", "", committee)
committee_data = df %>% filter(.data[[committee]] == 1)
total_trades = nrow(committee_data)
successes = nrow(committee_data %>% filter(trade_success == 1))
if (total_trades > 0) {
ci = compute_confidence_interval(successes, total_trades)
committee_success_rates = rbind(committee_success_rates,
data.frame(committee = committee_name,
success_rate = ci$proportion,
lower_ci = ci$lower_bound,
upper_ci = ci$upper_bound,
total_trades = total_trades))}
else {committee_success_rates = rbind(committee_success_rates,
data.frame(committee = committee_name,
success_rate = NA,
lower_ci = NA,
upper_ci = NA,
total_trades = total_trades))}}
committee_success_rates = committee_success_rates %>% filter(!is.na(success_rate))
committee_success_rates = committee_success_rates %>% arrange(desc(success_rate))
ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate))) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
width = 0.2, color = "black") +
labs(title = "Success Rate by Committee with Confidence Intervals",
x = "Success Rate", y = "Committee") + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 16, margin = margin(b = 10)),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
max_trades = max(committee_success_rates$total_trades, na.rm = TRUE)
committee_success_rates = committee_success_rates %>%
mutate(bubble_size = total_trades / max_trades * 1000)
ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate), size = bubble_size)) +
geom_point(alpha = 0.6, color = "steelblue") +
scale_size_continuous(range = c(3, 20),
name = "Number of Trades",
breaks = c(100, 500, 1000),
labels = c("100 trades", "500 trades", "1000 trades")) +
labs(title = "Success Rate by Committee (Bubble Size = Number of Trades)",
x = "Success Rate", y = "Committee") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
legend.position = "right" )
min_trades = 30
filtered_committee_df = committee_success_rates %>%
filter(total_trades >= min_trades)
ggplot(filtered_committee_df, aes(x = success_rate, y = reorder(committee, success_rate))) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
labs(title = paste("Success Rate by Committee (Minimum", min_trades, "Trades)"),
x = "Success Rate", y = "Committee") + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 16, margin = margin(b = 10)),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
plot.margin = margin(t = 20, r = 20, b = 20, l = 20))
min_trades = 30
sector_success_rates = df %>% filter(!is.na(sector)) %>%
group_by(sector) %>%
summarise(total_trades = n(),
successes = sum(trade_success, na.rm = TRUE),
.groups = "drop") %>%
rowwise() %>%
mutate(ci = list(compute_confidence_interval(successes, total_trades)),
success_rate = ci$proportion,
lower_ci = ci$lower_bound,
upper_ci = ci$upper_bound) %>%
unnest_wider(ci) %>%
filter(total_trades >= min_trades)
ggplot(sector_success_rates, aes(x = success_rate, y = reorder(sector, success_rate))) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
width = 0.2, color = "black") +
geom_text(aes(label = paste0(total_trades, " trades"), x = success_rate + 0.01),
hjust = 0, size = 3) +
labs(title = "Success Rate by Sector with Confidence Intervals",
x = "Success Rate", y = "Sector") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
plot.margin = margin(t = 20, r = 20, b = 20, l = 20))
ggplot(df, aes(x = seniority_years, y = trade_success)) +
geom_jitter(width = 0.2, height = 0.05, color = "blue", alpha = 0.6) +
labs(title = "Success vs. Seniority Years",
x = "Seniority (Years)", y = "Success (1 = Successful Trade)") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
ggplot(df, aes(x = factor(trade_success), y = seniority_years)) +
geom_boxplot(fill = "skyblue", color = "black", width = 0.5) +
labs(title = "Seniority Years by Success",
x = "Success (0 = Unsuccessful, 1 = Successful)",
y = "Seniority (Years)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
party_success = df %>%
group_by(party) %>%
summarise(average_success_rate = mean(trade_success, na.rm = TRUE),
.groups = "drop")
ggplot(party_success, aes(x = party, y = average_success_rate)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "Success Rate by Party", x = "Party", y = "Average Success Rate") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
pairplot_vars = c("trade_price", "price_change_pct", "seniority_years", "excess_return", "trade_success")
pairplot_data = df %>%
select(all_of(pairplot_vars)) %>%
mutate(trade_success = as.factor(trade_success))
ggpairs(pairplot_data, aes(color = trade_success, alpha = 0.6),
lower = list(continuous = wrap("points", alpha = 0.4)),
upper = list(continuous = wrap("cor", size = 4)),
diag = list(continuous = wrap("densityDiag", alpha = 0.4))) +
theme_minimal() + theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 14)) +
labs(title = "Pair Plot of Selected Variables by Trade Success")
df = df %>%
mutate(party_encoded = as.numeric(as.factor(party)),
gender_encoded = as.numeric(as.factor(gender)))
corr_vars = c("trade_price", "price_change_pct", "seniority_years",
"excess_return", "trade_success", "party_encoded", "gender_encoded")
subset_df = df %>% select(all_of(corr_vars))
corr_matrix = cor(subset_df, use = "complete.obs")
ggcorrplot(corr_matrix,
method = "square",
type = "full",
lab = TRUE,
lab_size = 4,
colors = c("red", "white", "blue"),
title = "Correlation Matrix Including Encoded Categorical Variables",
legend.title = "Correlation") + theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1))
committee_columns = grep("committee_", names(df), value = TRUE)
committee_df = df %>% select(all_of(committee_columns))
committee_corr = cor(committee_df, use = "complete.obs")
colnames(committee_corr) = abbreviate(colnames(committee_corr), minlength = 15)
rownames(committee_corr) = abbreviate(rownames(committee_corr), minlength = 15)
ggcorrplot(committee_corr,
method = "square",
type = "full",
lab = FALSE,
colors = c("red", "white", "blue"),
title = "Correlation Matrix of Committee Memberships",
legend.title = "Correlation") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
legend.position = "right",
axis.text.x = element_text(size = 8, angle = 90, hjust = 1),
axis.text.y = element_text(size = 8))
df = df %>%
mutate(num_committees = rowSums(select(., starts_with("committee_")), na.rm = TRUE))
success_by_committees <- df %>%
group_by(num_committees) %>%
summarise(average_success_rate = mean(trade_success, na.rm = TRUE),
.groups = "drop")
ggplot(success_by_committees, aes(x = num_committees, y = average_success_rate)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "Success Rate by Number of Committee Memberships",
x = "Number of Committees", y = "Average Success Rate") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
df = df %>%
mutate(trade_date = as.Date(trade_date))
df = df %>%
mutate(trade_year = format(trade_date, "%Y"))
trades_per_year = df %>%
group_by(trade_year) %>%
summarise(num_trades = n(), .groups = "drop")
ggplot(trades_per_year, aes(x = as.numeric(trade_year), y = num_trades)) +
geom_line(group = 1, color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(title = "Number of Trades Over Time",
x = "Year", y = "Number of Trades") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
df = df %>%
mutate(trade_date = as.Date(trade_date),
trade_year = format(trade_date, "%Y"))
success_per_year = df %>%
group_by(trade_year) %>%
summarise(avg_success_rate = mean(trade_success, na.rm = TRUE),
.groups = "drop")
ggplot(success_per_year, aes(x = as.numeric(trade_year), y = avg_success_rate)) +
geom_line(group = 1, color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(title = "Success Rate Over Time",
x = "Year", y = "Average Success Rate") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
sector_success = df %>%
group_by(sector) %>%
summarise(avg_success_rate = mean(trade_success, na.rm = TRUE),
.groups = "drop")
ggplot(sector_success, aes(x = avg_success_rate, y = reorder(sector, avg_success_rate))) +
geom_bar(stat = "identity", fill = "darkorange", color = "black") +
labs(title = "Success Rate by Sector", x = "Average Success Rate", y = "Sector") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))
sector_counts = df %>%
group_by(sector) %>%
summarise(num_trades = n(), .groups = "drop")
ggplot(sector_counts, aes(x = num_trades, y = reorder(sector, num_trades))) +
geom_bar(stat = "identity", fill = "purple", color = "black") +
labs(title = "Number of Trades by Sector", x = "Number of Trades", y = "Sector") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))